## Parsed with column specification:
## cols(
## user_id = col_character(),
## program_id = col_character(),
## series_id = col_character(),
## genre = col_character(),
## start_date_time = col_datetime(format = ""),
## streaming_id = col_character(),
## prog_duration_min = col_double(),
## time_viewed_min = col_double(),
## duration_more_30s = col_double(),
## time_viewed_more_5s = col_double(),
## percentage_program_viewed = col_double(),
## watched_more_60_percent = col_double(),
## month = col_double(),
## day = col_double(),
## hour = col_double(),
## weekend = col_character(),
## time_of_day = col_character()
## )
## Observations: 313,256
## Variables: 17
## $ user_id <chr> "cd2006", "cd2006", "cd2006", "cd2006", "cd…
## $ program_id <chr> "b8fbf2", "e2f113", "0e0916", "ca03b9", "cf…
## $ series_id <chr> "e0480e", "933a1b", "b68e79", "5d0813", "eb…
## $ genre <chr> "Comedy", "Factual", "Entertainment", "Spor…
## $ start_date_time <dttm> 2017-01-19 22:17:04, 2017-02-14 19:12:36, …
## $ streaming_id <chr> "1484864257965_1", "1487099603980_1", "1484…
## $ prog_duration_min <dbl> 1.850000, 0.500000, 1.366667, 1.616667, 8.5…
## $ time_viewed_min <dbl> 1.85000000, 0.49908333, 1.36666667, 1.61666…
## $ duration_more_30s <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ time_viewed_more_5s <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ percentage_program_viewed <dbl> 1.000000000, 0.998166667, 1.000000000, 1.00…
## $ watched_more_60_percent <dbl> 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0…
## $ month <dbl> 1, 2, 1, 2, 3, 2, 4, 3, 4, 3, 3, 4, 3, 3, 2…
## $ day <dbl> 5, 3, 4, 1, 1, 3, 1, 6, 7, 7, 1, 5, 7, 3, 2…
## $ hour <dbl> 22, 19, 21, 14, 20, 19, 20, 21, 21, 20, 18,…
## $ weekend <chr> "weekday", "weekday", "weekday", "weekend",…
## $ time_of_day <chr> "Evening", "Evening", "Evening", "Day", "Ev…
## cleaned_BBC_Data
##
## 17 Variables 313256 Observations
## --------------------------------------------------------------------------------
## user_id
## n missing distinct
## 313256 0 9634
##
## lowest : 0001c6 000c1a 001c53 001d44 002b2e, highest: ffec33 ffec4c fff1fd fffd0b ffffca
## --------------------------------------------------------------------------------
## program_id
## n missing distinct
## 313256 0 20772
##
## lowest : 0.00E+00 00045f 0006cf 0006ef 000f59
## highest: ffe7df ffef0a fff417 fff6de fffca7
## --------------------------------------------------------------------------------
## series_id
## n missing distinct
## 313256 0 4246
##
## lowest : 001b96 00389f 00400b 00434d 0079b7, highest: ff3bfd ff737e ff9060 ff96ec ffb887
## --------------------------------------------------------------------------------
## genre
## n missing distinct
## 313256 0 12
##
## Children (16294, 0.052), Comedy (28110, 0.090), Drama (77342, 0.247),
## Entertainment (24257, 0.077), Factual (88001, 0.281), Learning (1952, 0.006),
## Music (3924, 0.013), News (47448, 0.151), NoGenre (719, 0.002), RelEthics
## (1077, 0.003), Sport (17440, 0.056), Weather (6692, 0.021)
## --------------------------------------------------------------------------------
## start_date_time
## n missing distinct Info
## 313256 0 300690 1
## Mean Gmd .05 .10
## 2017-02-26 23:53:32 1970-02-10 14:11:42 2017-01-05 22:32:41 2017-01-11 05:42:28
## .25 .50 .75 .90
## 2017-01-27 09:19:00 2017-02-24 21:01:39 2017-03-29 18:00:58 2017-04-18 12:15:17
## .95
## 2017-04-24 19:14:02
##
## lowest : 2017-01-01 00:00:01 2017-01-01 00:00:03 2017-01-01 00:00:04 2017-01-01 00:00:05 2017-01-01 00:00:06
## highest: 2017-04-30 22:51:33 2017-04-30 22:52:27 2017-04-30 22:53:18 2017-04-30 22:53:51 2017-04-30 22:55:43
## --------------------------------------------------------------------------------
## streaming_id
## n missing distinct
## 313256 0 268031
##
## lowest : 1482085634977_2 1482088915673_2 1482224444938_1 1482432425783_2 1482620908113_1
## highest: 1493593235133_3 1493593282442_1 1493593664011_2 1493593706256_1 1493677079660_1
## --------------------------------------------------------------------------------
## prog_duration_min
## n missing distinct Info Mean Gmd .05 .10
## 313256 0 1143 0.985 49.63 39.73 0.5 5.0
## .25 .50 .75 .90 .95
## 30.0 45.0 60.0 85.0 120.0
##
## lowest : 0.5000000 0.5166667 0.5166667 0.5333333 0.5333333
## highest: 480.0000000 485.0000000 500.0000000 510.0000000 720.0000000
## --------------------------------------------------------------------------------
## time_viewed_min
## n missing distinct Info Mean Gmd .05 .10
## 313256 0 219956 1 18.02 22.42 0.1264 0.1662
## .25 .50 .75 .90 .95
## 1.0177 10.0007 28.0094 57.2412 59.0043
##
## lowest : 0.08333333 0.08333333 0.08335000 0.08336667 0.08336667
## highest: 279.66650000 290.00000000 321.49550000 335.00000000 355.00000000
## --------------------------------------------------------------------------------
## duration_more_30s
## n missing distinct Info Mean Gmd
## 313256 0 1 0 1 0
##
## Value 1
## Frequency 313256
## Proportion 1
## --------------------------------------------------------------------------------
## time_viewed_more_5s
## n missing distinct Info Mean Gmd
## 313256 0 1 0 1 0
##
## Value 1
## Frequency 313256
## Proportion 1
## --------------------------------------------------------------------------------
## percentage_program_viewed
## n missing distinct Info Mean Gmd .05 .10
## 313256 0 250689 0.998 0.4624 0.4497 0.002671 0.006227
## .25 .50 .75 .90 .95
## 0.053571 0.313905 0.966617 1.000000 1.000000
##
## lowest : 0.0001351620 0.0001606250 0.0001791667 0.0001928704 0.0001960417
## highest: 0.9999998039 0.9999998148 0.9999998611 0.9999999000 1.0000000000
## --------------------------------------------------------------------------------
## watched_more_60_percent
## n missing distinct Info Sum Mean Gmd
## 313256 0 2 0.711 120913 0.386 0.474
##
## --------------------------------------------------------------------------------
## month
## n missing distinct Info Mean Gmd
## 313256 0 4 0.935 2.398 1.275
##
## Value 1 2 3 4
## Frequency 94268 72745 73540 72703
## Proportion 0.301 0.232 0.235 0.232
## --------------------------------------------------------------------------------
## day
## n missing distinct Info Mean Gmd
## 313256 0 7 0.979 3.841 2.338
##
## Value 1 2 3 4 5 6 7
## Frequency 55897 44933 44470 43073 40814 40671 43398
## Proportion 0.178 0.143 0.142 0.138 0.130 0.130 0.139
## --------------------------------------------------------------------------------
## hour
## n missing distinct Info Mean Gmd .05 .10
## 313256 0 24 0.996 15.38 6.84 1 6
## .25 .50 .75 .90 .95
## 12 17 20 22 23
##
## lowest : 0 1 2 3 4, highest: 19 20 21 22 23
## --------------------------------------------------------------------------------
## weekend
## n missing distinct
## 313256 0 2
##
## Value weekday weekend
## Frequency 213961 99295
## Proportion 0.683 0.317
## --------------------------------------------------------------------------------
## time_of_day
## n missing distinct
## 313256 0 4
##
## Value Afternoon Day Evening Night
## Frequency 46075 81102 135965 50114
## Proportion 0.147 0.259 0.434 0.160
## --------------------------------------------------------------------------------
The column descriptions are as follows.
user_id – a unique identifier for the viewer
program_id and series_id – these identify the program and the series that the program belongs to
genre – the programme’s genre (e.g., drama, factual, news, sport, comedy, etc)
start_date_time – the streaming start date/time of the event
streaming_id – a unique identifier per streaming event
prog_duration_min – the program duration in minutes
time_viewed_min – how long the customer watched the program in minutes
duration_more_30s - equals 1 if the program duration is more than 30 seconds, equals 0 otherwise
time_viewed_more_5s - equals 1 if time_viewed is more than 5 seconds, equals 0 otherwise
percentage_program_viewed – percentage of the program viewed
watched_more_60_percent – equals 1 if more than 60% of the program is watched, equals 0 otherwise
month, day, hour, weekend – timing of the viewing
time_of_day – equals “Night” if the viewing occurs between 22 and 6AM, “Day” if it occurs between 6AM and 14, “Afternoon” if it occurs between 14 and 17, “Evening” otherwise
Before we proceed let’s consider the usage in January only.
The data is presented to us in an event-based format (every row denotes the event that someone started streaming content from the iPlayer). Let’s change the current format to a customer-based dataset. In what dimensions could BBC iPlayer users be differentiated? Come up with variables that capture these from the data we are given.
#Let's find the number of shows on weekend and weekdays
userData2 <- cleaned_BBC_Data %>%
  group_by(user_id, weekend) %>%
  summarise(noShows = n())
#Let's find the percentage of viewing on weekends vs weekdays for each user
userData3 <- userData2 %>%
  group_by(user_id) %>%
  mutate(weight_pct = noShows / sum(noShows))
#Let's create a data frame with each user in a row.
userData3 <- select(userData3, -noShows)
userData3 <- userData3 %>%
  spread(weekend, weight_pct, fill = 0) %>%
  as.data.frame()
#Let's merge the final result with the data frame from the previous step.
# NOTE(review): userData (per-user noShows/total_Time) is built in an earlier
# step not shown here -- confirm it exists before this point.
userdatall <- left_join(userData, userData3, by = "user_id")
# Repeat the same idea for time_of_day: the share of each user's viewing
# that falls in each part of the day. After summarise() drops the last
# grouping level, mutate() computes the proportion within each user.
userData2 <- cleaned_BBC_Data %>%
  group_by(user_id, time_of_day) %>%
  summarise(noShows = n()) %>%
  mutate(weight_pct = noShows / sum(noShows))
userData4 <- select(userData2, -c(noShows))
userData4 <- spread(userData4, time_of_day, weight_pct, fill = 0)
userdatall <- left_join(userdatall, userData4, by = "user_id")
Find the proportion of shows watched in each genre by each user. Add it to the data frame userdatall and use the head function to show the first few rows.
# Proportion of each user's viewing events that fall in each genre,
# spread to one column per genre and joined onto the user-level table.
userData2 <- cleaned_BBC_Data %>%
  group_by(user_id, genre) %>%
  summarise(noShows = n()) %>%
  mutate(proportion = noShows / sum(noShows))
userData5 <- select(userData2, -c(noShows))
userData5 <- spread(userData5, genre, proportion, fill = 0)
userdatall <- left_join(userdatall, userData5, by = "user_id")
head(userdatall)
## # A tibble: 6 x 21
## user_id noShows total_Time weekday weekend Afternoon Day Evening Night
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0001c6 2 16.7 1 0 0 0 1 0
## 2 000c1a 2 0.310 1 0 0 1 0 0
## 3 001c53 2 1.87 1 0 0 1 0 0
## 4 001d44 1 1.06 0 1 0 0 1 0
## 5 002b2e 31 534. 0.871 0.129 0 0.0968 0.806 0.0968
## 6 002b43 1 1.31 1 0 0 0 1 0
## # … with 12 more variables: Children <dbl>, Comedy <dbl>, Drama <dbl>,
## # Entertainment <dbl>, Factual <dbl>, Learning <dbl>, Music <dbl>,
## # News <dbl>, NoGenre <dbl>, RelEthics <dbl>, Sport <dbl>, Weather <dbl>
Add a useful variable for differentiating viewers by creating a new one from the existing columns in the dataset.
#average percentage of each programme viewed, per user
# Density of each user's average percentage-of-program-viewed ("Loyalty")
cleaned_BBC_Data %>%
  group_by(user_id, percentage_program_viewed) %>%
  summarise(noShows = n()) %>%
  summarise(Loyalty = sum(percentage_program_viewed) / sum(noShows)) %>%
  ggplot(aes(x = Loyalty)) +
  geom_density()
# Binary loyalty flag: 1 if the user's average viewed share exceeds 0.5.
# NOTE(review): grouping by percentage_program_viewed first means duplicated
# percentage values enter the numerator once but the denominator once per show;
# confirm this weighting is intended (vs. a plain mean(percentage_program_viewed)
# per user).
userData2 <- cleaned_BBC_Data %>%
  group_by(user_id, percentage_program_viewed) %>%
  summarise(noShows = n()) %>%
  summarise(Loyalty = sum(percentage_program_viewed) / sum(noShows)) %>%
  mutate(Loyalty = if_else(Loyalty > 0.5, 1, 0))
userdatall <- left_join(userdatall, userData2, by = "user_id")
head(userdatall)
## # A tibble: 6 x 22
## user_id noShows total_Time weekday weekend Afternoon Day Evening Night
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0001c6 2 16.7 1 0 0 0 1 0
## 2 000c1a 2 0.310 1 0 0 1 0 0
## 3 001c53 2 1.87 1 0 0 1 0 0
## 4 001d44 1 1.06 0 1 0 0 1 0
## 5 002b2e 31 534. 0.871 0.129 0 0.0968 0.806 0.0968
## 6 002b43 1 1.31 1 0 0 0 1 0
## # … with 13 more variables: Children <dbl>, Comedy <dbl>, Drama <dbl>,
## # Entertainment <dbl>, Factual <dbl>, Learning <dbl>, Music <dbl>,
## # News <dbl>, NoGenre <dbl>, RelEthics <dbl>, Sport <dbl>, Weather <dbl>,
## # Loyalty <dbl>
Comments:
I added the variable called ‘Loyalty’, which captures the loyalty of customers to the programmes they watch by calculating the average percentage of a program viewed by each customer, because it is an important property of users. It tells us whether the customer tends to finish the programmes they start watching or not. I calculated this variable by adding up each user’s percentage of program viewed and dividing it by the number of shows.
Let’s visualize the information captured in the user based data. Let’s start with the correlations.
library("GGally")
# Pairwise Pearson correlations of all user-level features (user_id dropped).
userdatall %>%
  select(-user_id) %>%
  ggcorr(method = c("pairwise", "pearson"), layout.exp = 3, label_round = 2,
         label = TRUE, label_size = 2, hjust = 1)
Observe the most correlated variables and the implication of this for the clustering.
The variables which are most correlated are noShows and total_Time. This is natural: if a person watches a higher number of shows, the total viewing time increases as well. Also, the weekend and weekday variables are perfectly negatively correlated, because the two proportions sum to one — users who watch mostly at weekends necessarily watch proportionally less on weekdays.
These strong correlations imply that some of these variables should be eliminated from the analysis dataset to produce a reasonable clustering. If the variables in the analysis are highly correlated, a collinearity problem arises: the information they share is effectively weighted twice as heavily as that of other variables, which distorts the clustering.
Investigate the distribution of noShows and total_Time using box plots (
geom_boxplot) and histograms (geom_histogram).
# Box plot of total_Time against noShows.
userdatall %>%
  ggplot(aes(x = noShows, y = total_Time)) +
  geom_boxplot() +
  labs(
    x = "Number of shows",
    y = "Total amount of time",
    title = ""
  )
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?
# NOTE(review): the warning above arises because geom_boxplot() expects a
# discrete x; with a continuous noShows, consider aes(group = cut_width(noShows, 10))
# or a single boxplot of each variable -- confirm intent.
# Histogram of the number of shows per user.
userdatall %>%
  ggplot(aes(x = noShows)) +
  geom_histogram() +
  labs(
    x = "Number of shows",
    y = "Number of users",
    title = "Total amount of shows watched for a month"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Comments: In the box plot and the histogram, we can see that there is a huge number of outliers in this data in terms of the commitment of BBC iPlayer users. However, since the number of outliers is so large, they may themselves be a factor in determining separate clusters.
Delete the records for users whose total view time is less than 5 minutes and who view 5 or fewer programs. These users are not very likely to be informative.
Start training a K-Means model. I initiate with 2 clusters and made sure to de-select the user_id variable and scale the data. Set max iterations to 100 and use 50 random starts. Displayed the cluster sizes and used summary("kmeans Object") to examine the components of the results of the clustering algorithm. (3045 and 162 points in each cluster)
# Get rid of variables that we don't need and that are highly correlated with other variables
userdatall2 <- userdatall %>% select(-user_id, -noShows, -weekend, -Day)
# take log of total time because it is highly skewed
userdatall2 <- userdatall2 %>% mutate(total_Time = log(total_Time))
#scale the data so every feature contributes comparably to the distance metric
userdatall2 <- data.frame(scale(userdatall2))
#train kmeans clustering: k = 2, 50 random starts, up to 100 iterations
model_km2 <- kmeans(userdatall2, centers = 2, nstart = 50, iter.max = 100)
summary(model_km2)
## Length Class Mode
## cluster 3207 -none- numeric
## centers 36 -none- numeric
## totss 1 -none- numeric
## withinss 2 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 2 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
## [1] 3045 162
Plot the cluster centers. Describe the clusters that the algorithm suggests.
# Average (scaled) feature values per cluster -> long format -> line plot of centers.
centre_locations <- userdatall_withClusters %>%
  group_by(cluster) %>%
  summarise_at(vars(total_Time:Loyalty), mean)
xa2 <- gather(centre_locations, key = "variable", value = "value", -cluster, factor_key = TRUE)
knnCentres <- ggplot(xa2, aes(x = variable, y = value)) +
  geom_line(aes(color = cluster, group = cluster), linetype = "dashed", size = 1) +
  geom_point(size = 2, shape = 4) +
  geom_hline(yintercept = 0) +
  ggtitle("K-means centers k=2") +
  labs(fill = "Cluster") +
  theme(text = element_text(size = 10),
        axis.text.x = element_text(angle = 45, hjust = 1),
        legend.title = element_text(size = 5), legend.text = element_text(size = 5))
knnCentres
These two clusters basically show us two types of users of BBC iPlayer. Cluster 2 contains only 162 users out of 3207, and it is mainly made up of people who use the iPlayer for educational purposes, which we can see from the plot. The users in cluster 2 watch the Children and Learning genres, spending less time in total than the people in cluster 1. They also use this service during the afternoon and day rather than the night and evening, which suggests these customers are mainly households with children. Thus, it is a meaningful cluster in that it identifies customers with children in their household.
This cluster analysis can help BBC iPlayer management to see what type of programmes are popular to this particular type of customers with children. This type of clustering analysis can be useful for the marketing purposes and product improvement.
Plot a scatter plot for the viewers with respect to total_Time and weekday variables with color set to the cluster number of the user. What do we observe? Which variable seems to play a more prominent role in determining the clusters of users? Add an additional variable that should play a role in cluster assignments using size option.
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
# Scatter of users by total_Time and weekday share, coloured by cluster.
a <- ggplot(userdatall_withClusters, aes(x = total_Time, y = weekday, color = as.factor(cluster))) +
  geom_jitter() +
  labs(
    color = "Cluster"
  )
# Same plot with point size mapped to the Children genre share.
ab <- ggplot(userdatall_withClusters, aes(x = total_Time, y = weekday, color = as.factor(cluster), size = Children)) +
  geom_jitter(alpha = 0.5) +
  labs(
    color = "Cluster"
  )
# BUG FIX: the original printed `b`, which is never defined; the sized plot is `ab`.
ab
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
It shows that ‘weekday’ is not the variable that actually distinguishes clusters 1 and 2. Meanwhile, the ‘Children’ variable affects the cluster assignment more than ‘weekday’ does: as we can see, all the points in cluster 2 are larger than those in cluster 1.
Repeat the previous step using the first two principal components via the fviz_cluster function.
library(factoextra)
# Cluster plot in the space of the first two principal components.
fviz_cluster(model_km2, userdatall2, palette = "Set2", ggtheme = theme_minimal(), geom = "point")
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 1.771753e+00 9.843074e+00 9.843074
## Dim.2 1.705520e+00 9.475111e+00 19.318185
## Dim.3 1.523506e+00 8.463923e+00 27.782108
## Dim.4 1.364902e+00 7.582789e+00 35.364897
## Dim.5 1.316230e+00 7.312387e+00 42.677283
## Dim.6 1.194488e+00 6.636045e+00 49.313328
## Dim.7 1.147612e+00 6.375620e+00 55.688948
## Dim.8 1.058451e+00 5.880281e+00 61.569229
## Dim.9 1.005826e+00 5.587925e+00 67.157154
## Dim.10 9.849678e-01 5.472043e+00 72.629197
## Dim.11 9.285727e-01 5.158737e+00 77.787934
## Dim.12 9.004910e-01 5.002728e+00 82.790662
## Dim.13 8.003021e-01 4.446123e+00 87.236785
## Dim.14 7.101773e-01 3.945429e+00 91.182214
## Dim.15 6.305257e-01 3.502921e+00 94.685134
## Dim.16 6.113851e-01 3.396584e+00 98.081718
## Dim.17 3.452907e-01 1.918282e+00 100.000000
## Dim.18 1.765332e-30 9.807400e-30 100.000000
# Scree plot with histogram
# FIX: the original spelt the argument "addlabes"; it was silently swallowed by
# `...` so the percentage labels never appeared.
ScreePlot1 <- fviz_screeplot(pca_bbc, addlabels = TRUE)
fviz_eig(pca_bbc, choice = "variance", addlabels = TRUE)
## Warning in fviz_pca_contrib(pca_bbc, choice = "var"): The function
## fviz_pca_contrib() is deprecated. Please use the function fviz_contrib() which
## can handle outputs of PCA, CA and MCA functions.
# Scree plot - Eigenvalues
ScreePlot3 <- fviz_eig(pca_bbc, choice = "eigenvalue", addlabels = TRUE)
# Use only bar or line plot: geom = "bar" or geom = "line"
ScreePlot4 <- fviz_eig(pca_bbc, geom = "line")
# Variable loadings on the 2D correlation circle (first two PCs).
# NOTE(review): fviz_pca_var() has no "choice"/"addlabes" arguments; they were
# silently ignored in the original call and are dropped here.
ScreePlot5 <- fviz_pca_var(pca_bbc, ggtheme = theme_minimal())
ScreePlot1
I am not worried about the outliers because I eliminated all the variables that are highly correlated, including ‘Day’. I eliminated the ‘Day’ variable because, when I included it and conducted PCA, ‘Day’ and ‘Evening’ became the two highest-contribution variables, which does not make sense for creating reasonable clusters. Now the two variables contributing most to separating the clusters are ‘Children’ and ‘News’ (we can see this on the PCA scree plot). Thus, I believe that the two clusters are not strongly affected by outliers in the ‘total_Time’ variable.
Produce an elbow chart and identify a reasonable range for the number of clusters.
library(purrr)
# Use map_dbl to run many models with varying value of k (centers)
kmax_elbow <- 15
tot_withinss <- map_dbl(1:kmax_elbow, function(k) {
  model <- kmeans(x = userdatall2, centers = k, iter.max = 100, nstart = 40)
  model$tot.withinss
})
# Generate a data frame containing both k and tot_withinss
elbow_df <- data.frame(
  k = 1:kmax_elbow,
  tot_withinss = tot_withinss
)
# Plot the elbow plot
# FIX: axis breaks now follow kmax_elbow instead of a hard-coded 1:15.
ggplot(elbow_df, aes(x = k, y = tot_withinss)) +
  geom_line() +
  scale_x_continuous(breaks = 1:kmax_elbow)
Repeat the previous step for Silhouette analysis.
Summary of the conclusion from Elbow Chart and Silhouette analysis.
From the elbow chart, I can’t see a steep drop-off where I could easily choose the proper number of clusters k; the slope decreases at a similar rate for any number of clusters. From the silhouette method, I can see that 10 clusters gives the highest silhouette width. However, as the clusters overlap more as their number increases, I would rather choose the number of clusters by inspecting the PCA-based visualisation of the clusters.
#Fit kmeans models with k = 2, 3, 4, 5
# FIX: model_km3/model_km4/model_km5 were used below (and in later sections)
# but never fitted in the visible code; their fits are restored here.
model_km2 <- eclust(userdatall2, "kmeans", k = 2, nstart = 50, graph = FALSE)
model_km3 <- eclust(userdatall2, "kmeans", k = 3, nstart = 50, graph = FALSE)
model_km4 <- eclust(userdatall2, "kmeans", k = 4, nstart = 50, graph = FALSE)
model_km5 <- eclust(userdatall2, "kmeans", k = 5, nstart = 50, graph = FALSE)
model_km2$size
## [1] 162 3045
model_km3$size
## [1] 160 776 2271
model_km4$size
## [1] 747 163 996 1301
model_km5$size
## [1] 163 778 629 905 732
# plots to compare
#I use the fviz_cluster function which is part of the `factoextra` library
p1 <- fviz_cluster(model_km2, geom = "point", data = userdatall2) + ggtitle("k = 2")
p2 <- fviz_cluster(model_km3, geom = "point", data = userdatall2) + ggtitle("k = 3")
p3 <- fviz_cluster(model_km4, geom = "point", data = userdatall2) + ggtitle("k = 4")
p4 <- fviz_cluster(model_km5, geom = "point", data = userdatall2) + ggtitle("k = 5")
library(gridExtra)
grid.arrange(p1, p2, p3, p4, nrow = 2)
According to the 4 plots, I can see that there is a certain cluster that does not change as the number of clusters increases. As the most distinct cluster doesn’t change, it would be better to choose k = 3, because the other part (the upper-left region) overlaps more and more as the number of clusters increases.
#Plot centers for k=2
xa <- data.frame(cluster = as.factor(c(1:2)), model_km2$centers)
# FIX: the k=2 intermediate was named xa2k3 in the original; renamed for consistency.
xa2k2 <- xa %>% gather(variable, value, -cluster, factor_key = TRUE)
graphknn2 <- ggplot(xa2k2, aes(x = variable, y = value)) +
  geom_line(aes(color = cluster, group = cluster), linetype = "dashed", size = 1) +
  geom_point(size = 1, shape = 4) +
  geom_hline(yintercept = 0) +
  theme(text = element_text(size = 10),
        axis.text.x = element_text(angle = 45, hjust = 1),
        legend.title = element_text(size = 5), legend.text = element_text(size = 5)) +
  ggtitle("K-means Centers k=2")
#Plot centers for k=3
xa <- data.frame(cluster = as.factor(c(1:3)), model_km3$centers)
xa2k3 <- xa %>% gather(variable, value, -cluster, factor_key = TRUE)
graphknn3 <- ggplot(xa2k3, aes(x = variable, y = value)) +
  geom_line(aes(color = cluster, group = cluster), linetype = "dashed", size = 1) +
  geom_point(size = 1, shape = 4) +
  geom_hline(yintercept = 0) +
  theme(text = element_text(size = 10),
        axis.text.x = element_text(angle = 45, hjust = 1),
        legend.title = element_text(size = 5), legend.text = element_text(size = 5)) +
  ggtitle("K-means Centers k=3")
#Plot centers for k=4
xa <- data.frame(cluster = as.factor(c(1:4)), model_km4$centers)
xa4 <- xa %>% gather(variable, value, -cluster, factor_key = TRUE)
graphknn4 <- ggplot(xa4, aes(x = variable, y = value)) +
  geom_line(aes(color = cluster, group = cluster), linetype = "dashed", size = 1) +
  geom_point(size = 1, shape = 4) +
  geom_hline(yintercept = 0) +
  theme(text = element_text(size = 10),
        axis.text.x = element_text(angle = 45, hjust = 1),
        legend.title = element_text(size = 5), legend.text = element_text(size = 5)) +
  ggtitle("K-means Centers k=4")
#Plot centers for k=5
xa <- data.frame(cluster = as.factor(c(1:5)), model_km5$centers)
xa2 <- xa %>% gather(variable, value, -cluster, factor_key = TRUE)
graphknn5 <- ggplot(xa2, aes(x = variable, y = value)) +
  geom_line(aes(color = cluster, group = cluster), linetype = "dashed", size = 1) +
  geom_point(size = 1, shape = 4) +
  geom_hline(yintercept = 0) +
  theme(text = element_text(size = 10),
        axis.text.x = element_text(angle = 45, hjust = 1),
        legend.title = element_text(size = 5), legend.text = element_text(size = 5)) +
  ggtitle("K-means Centers k=5")
grid.arrange(graphknn2, graphknn3, graphknn4, graphknn5, nrow = 4)
Looking at these 4 plots, it seems reasonable to set the ideal number of clusters to 4, because these 4 clusters clearly show different segments of people, such as those who are into Drama, Children & Learning, and News & Weather. In hindsight I should have designed the new variable more carefully, as the Loyalty variable does not help to distinguish any specific segment of customers. I personally think that total_Time could also have defined a segment of its own, because some individuals simply spend a lot of time watching programmes without a particular purpose or preference (e.g. TVs in stores, or people who leave the TV on all day).
Fit a PAM model for the k value I chose above for k-means. Determine how many points each cluster has. Plot the centers of the clusters and produce the PCA visualization.
k = 4
#Fit a PAM model
k4_pam <- eclust(userdatall2, "pam", k = k, graph = FALSE)
#Check the cluster sizes
k4_pam$clusinfo
## size max_diss av_diss diameter separation
## [1,] 1340 21.32385 3.057657 24.97547 0.5570248
## [2,] 576 11.58818 3.552756 17.26607 1.0263983
## [3,] 1141 16.17630 4.061424 21.89103 0.5570248
## [4,] 150 15.31108 4.789563 21.04540 1.3882904
#Plot the centers
bbc_withClusters <- mutate(userdatall2,
                           cluster = as.factor(k4_pam$cluster))
center_locations <- bbc_withClusters %>%
  group_by(cluster) %>%
  summarize_at(vars(total_Time:Loyalty), mean)
xa2p <- gather(center_locations, key = "variable", value = "value", -cluster, factor_key = TRUE)
pamcenters <- ggplot(xa2p, aes(x = variable, y = value)) +
  geom_line(aes(color = cluster, group = cluster), linetype = "dashed", size = 1) +
  geom_point(size = 2, shape = 4) +
  geom_hline(yintercept = 0) +
  ggtitle(paste("PAM Centers k=", k)) +
  labs(fill = "Cluster") +
  theme(text = element_text(size = 10),
        axis.text.x = element_text(angle = 45, hjust = 1),
        legend.title = element_text(size = 5), legend.text = element_text(size = 5)) +
  scale_colour_manual(values = c("darkgreen", "orange", "red", "blue"))
pamcenters
#PCA visualisation
# BUG FIX: "usderdatall2" was a typo for userdatall2 and would error at run time.
# NOTE(review): this visualises the k-means model (model_km4); if the PAM
# clusters were intended here, pass k4_pam instead -- confirm.
fviz_cluster(model_km4, data = userdatall2, geom = "point") +
  ggtitle("K-means k = 4") +
  scale_colour_manual(values = c("darkgreen", "orange", "red", "blue"))
Use hierarchical clustering with the same k I chose above. Set hc_method equal to "average" and then "ward.D" to tell the difference between the two methods. Visualize the results (called dendrograms) using the plot function to observe the differences. How many points does each cluster have? Plot the centers of the clusters and produce a PCA visualization for ward.D.
#Find distances between points
res.dist <- dist(userdatall2, method = "euclidean")
#Fit the model with average linkage and with Ward's method
res.hc <- hcut(res.dist, hc_method = "average", k = 4)
res.hc2 <- hcut(res.dist, hc_method = "ward.D", k = 4)
#Size of the clusters
res.hc$size
## [1] 3196 1 8 2
# FIX: the ward.D sizes were shown in the output below but the statement that
# printed them was missing.
res.hc2$size
## [1] 1702 916 437 152
## Length Class Mode
## merge 6412 -none- numeric
## height 3206 -none- numeric
## order 3207 -none- numeric
## labels 0 -none- NULL
## method 1 -none- character
## call 3 -none- call
## dist.method 1 -none- character
## cluster 3207 -none- numeric
## nbclust 1 -none- numeric
## silinfo 3 -none- list
## size 4 -none- numeric
## data 5140821 dist numeric
## cluster size ave.sil.width
## 1 1 3196 0.58
## 2 2 1 0.00
## 3 3 8 0.61
## 4 4 2 0.90
## Length Class Mode
## merge 6412 -none- numeric
## height 3206 -none- numeric
## order 3207 -none- numeric
## labels 0 -none- NULL
## method 1 -none- character
## call 3 -none- call
## dist.method 1 -none- character
## cluster 3207 -none- numeric
## nbclust 1 -none- numeric
## silinfo 3 -none- list
## size 4 -none- numeric
## data 5140821 dist numeric
## cluster size ave.sil.width
## 1 1 1702 0.13
## 2 2 916 -0.07
## 3 3 437 0.16
## 4 4 152 0.16
Plot the centers of H-clusters and compare the results with K-Means and PAM.
#First let's find the averages of the variables by cluster
# NOTE(review): this uses the average-linkage solution res.hc, whose sizes are
# 3196/1/8/2; the ward.D solution res.hc2 may be the intended one -- confirm.
userdata_withClusters <- mutate(userdatall2, cluster = as.factor(res.hc$cluster))
center_locations <- userdata_withClusters %>%
  group_by(cluster) %>%
  summarize_at(vars(total_Time:Loyalty), mean)
#Next I use gather to collect information together
xa2 <- gather(center_locations, key = "variable", value = "value", -cluster, factor_key = TRUE)
#Next I use ggplot to visualize centers
hclust_center <- ggplot(xa2, aes(x = variable, y = value, order = cluster)) +
  geom_line(aes(color = cluster, group = cluster), linetype = "dashed", size = 1) +
  geom_point(size = 2, shape = 4) +
  geom_hline(yintercept = 0) +
  ggtitle("H-clust K=4") +
  labs(fill = "Cluster") +
  scale_colour_manual(values = c("darkgreen", "orange", "red", "blue")) +
  theme(text = element_text(size = 10),
        axis.text.x = element_text(angle = 45, hjust = 1),
        legend.title = element_text(size = 5), legend.text = element_text(size = 5))
## Compare it with KMeans and PAM
# NOTE(review): this rebuilds graphknn4 identically to the earlier k=4 centers
# plot; it is kept only for side-by-side comparison.
graphknn4 <- ggplot(xa4, aes(x = variable, y = value)) +
  geom_line(aes(color = cluster, group = cluster), linetype = "dashed", size = 1) +
  geom_point(size = 1, shape = 4) +
  geom_hline(yintercept = 0) +
  theme(text = element_text(size = 10),
        axis.text.x = element_text(angle = 45, hjust = 1),
        legend.title = element_text(size = 5), legend.text = element_text(size = 5)) +
  ggtitle("K-means Centers k=4")
hclust_center
Comments: Hierarchical clustering and k-means both correctly identify the ‘Children’ category. In hierarchical clustering the “news/sport/weather” type of user is still visible, but it is no longer characterised by a specific time of day, compared to k-means where ‘Day’ shows a high reading. The category “spends a lot of time in front of the TV and has no genre preference” appears under both clustering methods. Based on the three methods above, I can see that the segments are well differentiated by genre. The clusters from PAM and k-means are very similar, while h-clust shows fairly different results.
We have chosen the number of clusters by now. We will try to reinforce the conclusions and verify that they are not due to chance by dividing the data into two equal parts. Use K-means clustering, fixing the number of clusters, and find the clusters in these two data sets separately. If we get similar-looking clusters, we can rest assured that our conclusions are robust. If not, we might want to reconsider the decision.
set.seed(1234)
library(rsample)
# Split the user-level data 50/50 and fit k-means (k=4) to each half independently.
train_test_split <- initial_split(userdatall2, prop = 0.5)
testing <- testing(train_test_split) #50% of the data is set aside for testing
training <- training(train_test_split) #50% of the data is set aside for training
#Fit k-means to each data set
model_testing <- kmeans(testing, centers = 4, nstart = 50, iter.max = 100)
model_training <- kmeans(training, centers = 4, nstart = 50, iter.max = 100)
# NOTE(review): the next three statements attach PAM clusters and compute centers
# that are not used in the plots below -- they look like a leftover from the PAM
# section; confirm before removing.
userdatall_withClusters <- mutate(userdatall_withClusters, cluster_pam = as.factor(k4_pam$clustering))
center_locations <- userdatall_withClusters %>%
  group_by(cluster_pam) %>%
  summarise_at(vars(total_Time:Loyalty), mean)
#Plot for training data
training_center <- data.frame(cluster = as.factor(c(1:4)), model_training$centers)
training_center <- training_center %>% gather(variable, value, -cluster, factor_key = TRUE)
training_plot <- ggplot(training_center, aes(x = variable, y = value)) +
  geom_line(aes(color = cluster, group = cluster), linetype = "dashed", size = 1) +
  geom_point(size = 1, shape = 4) +
  geom_hline(yintercept = 0) +
  theme(text = element_text(size = 10),
        axis.text.x = element_text(angle = 45, hjust = 1),
        legend.title = element_text(size = 5), legend.text = element_text(size = 5)) +
  ggtitle("K-means Centers k=4 (training)")  # FIX: titles disambiguated (were identical)
#Plot for testing data
testing_center <- data.frame(cluster = as.factor(c(1:4)), model_testing$centers)
testing_center <- testing_center %>% gather(variable, value, -cluster, factor_key = TRUE)
testing_plot <- ggplot(testing_center, aes(x = variable, y = value)) +
  geom_line(aes(color = cluster, group = cluster), linetype = "dashed", size = 1) +
  geom_point(size = 1, shape = 4) +
  geom_hline(yintercept = 0) +
  theme(text = element_text(size = 10),
        axis.text.x = element_text(angle = 45, hjust = 1),
        legend.title = element_text(size = 5), legend.text = element_text(size = 5)) +
  ggtitle("K-means Centers k=4 (testing)")
training_plot
I think I have chosen the right number k because it shows consistent results between the testing and training datasets. Also, from the plots I have created, I could observe that the 4 clusters showed different preferences for the genres, especially for the Children & Learning genres. I have done a good job at finding people who are dedicated to certain genres. However, I could have improved my analysis by adding a new variable to make a separate cluster for the people who simply spend far more time than others on BBC iPlayer. Based on this data, the BBC can improve their programme quality by conducting separate analyses by user segment (Learning & Children, Drama, News & Weather).